# Run cluster analysis for survey



# Setup ----
library(tidyverse)
library(xtable)

# Load survey data as "a"
# SOURCE: please directly contact the authors of You and Kousky (2024)






#####################################################################
# Table 1: Cluster Analysis of Funding Sources
#####################################################################




# only keep funding source variables -------------------------------------------
b = a[, grepl("fundingsource_", colnames(a), fixed=TRUE)==T]
colnames(b) = substr(colnames(b), 15, nchar(colnames(b)))
b$AnyIns = NULL
b$number = NULL
b$number_new = NULL


# k-means cluster analysis, try 1-6 # of clusters and pick the optimal # of clusters
set.seed(1135)
km1 <- kmeans(b, 1)
km2 <- kmeans(b, 2)
km3 <- kmeans(b, 3)
km4 <- kmeans(b, 4)
km5 <- kmeans(b, 5)
km6 <- kmeans(b, 6)

var.exp <- data.frame(k = c(1:6),
                      bss_tss = c(km1$betweenss/km1$totss,
                                  km2$betweenss/km2$totss,
                                  km3$betweenss/km3$totss,
                                  km4$betweenss/km4$totss,
                                  km5$betweenss/km5$totss,
                                  km6$betweenss/km6$totss))

var.exp     

# visually look at information gain from 1-6 # of clusters ---------------------
ggplot(data = var.exp, aes(x = k, y = bss_tss)) +         # information gain is the highest at 2 clusters
  geom_point() + 
  geom_line() +
  annotate("path",
           x=2+0.1*cos(seq(0,2*pi,length.out=100)),
           y=0.17+0.05*sin(seq(0,2*pi,length.out=100)), color = "red") +
  ggtitle("Elbow plot")

km2_distrib = km2$centers %>% data.frame()
km2_distrib

a1 = bind_cols(a, cluster = factor(km2$cluster)) %>% data.frame()
a1$cluster = factor(a1$cluster) %>% relevel(ref = 2)



# --------------------------------------------------
# -------- Compare the two clusters ----------------
# --------------------------------------------------

# create a function of T Test - comparing two categories -----------------------
test_func = function(varname1, category, dataname) {
  #varname1 - variable of interest
  #varname2 - category 
  var2 = enquo(category)
  var1 = enquo(varname1)
  
  b0 = dataname %>% filter(!is.na(!!var2) & !is.na(!!var1) & !is.infinite(!!var1))
  b1 = dataname %>% filter(!!var2==2 & !is.na(!!var1) & !is.infinite(!!var1)) %>% select(!!var1)
  b2 = dataname %>% filter(!!var2==1 & !is.na(!!var1) & !is.infinite(!!var1)) %>% select(!!var1)
  
  a1 <- b0 %>% 
    group_by(!!var2) %>% 
    summarise(Mean=format(round(mean(!!var1, na.rm = TRUE), 2), nsmall = 2)) %>% 
    data.frame()
  
  t1 <- t.test(b1, b2)
  a2 <- paste0(format(round(t1$statistic, 2), nsmall = 2), 
               ifelse(t1$p.value<=0.01,"***",
                      ifelse(t1$p.value<=0.05,"**",
                             ifelse(t1$p.value<=0.1,"*",""))))
  
  a3 <- cbind.data.frame(Cluster1=a1[1,-1],Cluster2=a1[2,-1],t_test=a2)
  rownames(a3) <- colnames(b1)[1]
  return(a3)
}

m1 = test_func(fundingsource_Flood_insurance,   cluster, a1)
m2 = test_func(fundingsource_Family_friends,    cluster, a1)
m3 = test_func(fundingsource_Mysavings,         cluster, a1)
m4 = test_func(fundingsource_credit_card,       cluster, a1)
m5 = test_func(fundingsource_FEMA_grant,        cluster, a1)
m6 = test_func(fundingsource_SBA_loan,          cluster, a1)
m7 = test_func(fundingsource_bank_loan,         cluster, a1)
m8 = test_func(fundingsource_charity_nonprofit, cluster, a1)
m9 = test_func(fundingsource_Myemployer,        cluster, a1)
m10= test_func(fundingsource_Local_goverment,   cluster, a1)


bind_rows(m1, m2, m3, m4, m5, m6, m7, m8, m9, m10)
xtable(bind_rows(m1, m2, m3, m4, m5, m6, m7, m8, m9, m10))




